Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
Read data
# Load the raw loan listings from the CSV file in the working directory.
all_data = pd.read_csv('prosperLoanData.csv')
Exploring the data
# Preview the first rows of the raw data.
all_data.head()
# (rows, columns) of the raw data.
all_data.shape
# Column dtypes and non-null counts.
all_data.info()
# How many rows are exact duplicates of an earlier row (True) versus not (False).
all_data.duplicated().value_counts()
This dataset contains 113,937 rows and 81 columns.
Each row has information on the borrower's APR, status, borrowed amount, debt, etc.
Variables with many missing values will be dropped to make the Dataset more accurate.
Outliers will also be removed to provide more reliable data.
I'm most interested in figuring out which features are best for predicting the borrower APR and the loan status of the loans in the dataset.
I expect that carat will have the strongest effect on each diamonds price: the larger the diamond, the higher the price. I also think that the other big "C"s of diamonds: cut, color, and clarity, will have effects on the price, though to a much smaller degree than the main effect of carat.
Selecting 17 features for visualization:
# Columns kept for the visual exploration (17 in total).
selected_columns = [
    'LoanKey', 'Term', 'LoanStatus', 'BorrowerAPR', 'BorrowerRate',
    'ListingCategory (numeric)', 'BorrowerState', 'Occupation',
    'EmploymentStatus', 'LoanOriginalAmount', 'CreditScoreRangeLower',
    'CreditScoreRangeUpper', 'DebtToIncomeRatio', 'Investors',
    'StatedMonthlyIncome', 'MonthlyLoanPayment', 'IncomeVerifiable',
]
data = all_data[selected_columns]
# Dtypes and non-null counts of the selected columns.
data.info()
Removing duplicate data
# Count duplicated rows in the selected columns before removing them.
data.duplicated().value_counts()
# Keep only the first occurrence of each duplicated row.
final_data = data.drop_duplicates()
# Re-check: no row should be flagged as a duplicate any more.
final_data.duplicated().value_counts()
# Number of missing values per column.
final_data.isnull().sum()
Removing null data
# Drop every row that has a missing value in any of the selected columns.
final_data = final_data.dropna()
# Re-check: every column should now report zero missing values.
final_data.isna().sum()
# (rows, columns) after cleaning.
final_data.shape
final_data.head()
final_data.info()
Number of rows in dataset are 97022 and Number of Columns in dataset are 17.
# Frequency of each loan term, most common first. Computed once and reused
# for the bar order and the bar annotations instead of being recomputed on
# every loop iteration.
term_counts = final_data['Term'].value_counts()
term_counts
term_counts.values[0]
sns.countplot(x='Term', data=final_data,
              color=sns.color_palette()[0], order=term_counts.index)
plt.xlabel('Term (Months)')
# Add the count of each term on top of its bar; bars are drawn in
# `term_counts` order, so the bar position equals the enumerate index.
for position, count in enumerate(term_counts.values):
    plt.text(position, count + 3000, count, ha='center', va='top')
plt.title('The length of the loan expressed in months.');
we can say that the length of the loan (in months) is maximum for 36 months and minimum for 12 months.
# Histogram of borrower interest rates using 0.01-wide bins that start at 0
# and extend just past the largest observed rate.
bin_width = 0.01
bins = np.arange(0, final_data['BorrowerRate'].max() + bin_width, bin_width)
plt.hist(data=final_data, x='BorrowerRate', bins=bins)
plt.xlabel('Borrower Rate')
plt.ylabel('Count')
plt.title('Distribution of BorrowerRate');
It is a bimodal distribution with one peak lying between 0.15 and 0.17 and another peak lying between 0.30 and 0.33.
It is a right-skewed distribution.
After 0.16 rate there is a decrease in the count but between 0.32 and 0.34 there is a sudden increase in the count of borrower rate.
# Tabulate, then plot, the number of loans in each status.
final_data['LoanStatus'].value_counts()
status_bar_color = sns.color_palette()[4]
sns.countplot(data=final_data, y='LoanStatus', color=status_bar_color)
plt.title('Displaying counts for each loan status')
Loan status "Current" has the most number of counts
Loan status "Completed" have second most number of counts.
Category names for each numerical value
# Human-readable name for each numeric listing category code.
list_cat = {0: 'Not Available', 1: 'Debt Consolidation', 2: 'Home Improvement',
            3: 'Business', 4: 'Personal Loan', 5: 'Student Use', 6: 'Auto',
            7: 'Other', 8: 'Baby&Adoption', 9: 'Boat', 10: 'Cosmetic Procedure',
            11: 'Engagement Ring', 12: 'Green Loans', 13: 'Household Expenses',
            14: 'Large Purchases', 15: 'Medical/Dental', 16: 'Motorcycle',
            17: 'RV', 18: 'Taxes', 19: 'Vacation', 20: 'Wedding Loans'}
# Translate every row's category code to its label with a direct dict lookup.
# The previous nested loop rebuilt list(list_cat.keys()) / list(list_cat.values())
# for every row and every code. Codes missing from `list_cat` are skipped,
# exactly as before.
category_codes = final_data['ListingCategory (numeric)'].values
l = [list_cat[code] for code in category_codes if code in list_cat]
Assigning a new column with listing labels.
# Attach the readable listing labels as a new column.
final_data['ListingLabels'] = l
final_data.head(5)
# Horizontal bar chart of listing labels, most frequent at the top.
label_order = final_data['ListingLabels'].value_counts().index
plt.figure(figsize=(10, 8))
sns.countplot(data=final_data, y='ListingLabels',
              color=sns.color_palette()[6], order=label_order)
plt.title('Displaying counts of listing labels in our data');
This means that the maximum number of loans is taken for the debt consolidation category.
This may mean that many borrowers already have existing debts which is why they take debt consolidation loans in order to reduce the burden of existing debts and offer a new loan every month at low interest.
# Borrowers per state, most common first. Computed once and shared by both
# plots instead of calling value_counts() five separate times.
state_counts = final_data['BorrowerState'].value_counts()

# Choropleth of the United States coloured by number of borrowers per state.
fig = px.choropleth(locations=state_counts.index,
                    locationmode="USA-states", color=state_counts.values,
                    scope="usa", hover_name=state_counts.index,
                    hover_data={'Borrowers': state_counts.values})
fig.layout.coloraxis.colorbar.title = 'Number of Borrowers'
fig.layout.title = 'Map showing states and the number of borrowers from each state in the United States.'
fig.show()

# The same counts as a horizontal bar chart, ordered from most to least.
plt.figure(figsize=(12, 9))
sns.countplot(y='BorrowerState', data=final_data,
              color=sns.color_palette()[3], order=state_counts.index)
plt.title('Displaying counts for each state in data')
The graphs show that the largest number of borrowers, 13106, are from CA (California),
which is shown in yellow in the choropleth.
# Number of borrowers per occupation, most common first.
occupations = final_data['Occupation'].value_counts()
occupations

# One bar chart per count band, paired with that chart's title.
# NOTE(review): the bands use strict inequalities and skip 5000-10000, so an
# occupation with a count in that gap (or exactly 200 / 1000) is not drawn.
occupation_bands = [
    (occupations > 10000,
     'Displaying occupation with counts > 10000'),
    ((occupations > 1000) & (occupations < 5000),
     'Displaying occupation with counts > 1000 and < 5000'),
    ((occupations > 200) & (occupations < 1000),
     'Displaying occupation with counts greater than 200 and less than 1000'),
    (occupations < 200,
     'Displaying occupation with counts < 200'),
]
for band_mask, band_title in occupation_bands:
    occupations[band_mask].plot.barh(color='Crimson', rot=0)
    plt.title(band_title)
    plt.xlabel('Count')
    plt.ylabel('Occupation')
    plt.show()
we can see that "other" category has the highest count and "Student Technical School" has the lowest count.
# Tabulate, then plot, the number of borrowers per employment status.
final_data['EmploymentStatus'].value_counts()
plt.figure(figsize=(14, 7))
sns.countplot(data=final_data, x='EmploymentStatus', color='GREEN')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=25)
plt.title('Displaying counts of employment status');
The plot shows that the maximum number of borrowers are Employed and the minimum number are Not employed
# Summary statistics, histogram (500-wide bins starting at 1000) and
# log10-scaled summary statistics of the original loan amount.
loan_amount = final_data['LoanOriginalAmount']
loan_amount.describe()
bin_step = 500
bins = np.arange(1000, loan_amount.max() + 40, bin_step)
plt.hist(data=final_data, x='LoanOriginalAmount', bins=bins)
plt.title('Distribution of loan original amount')
plt.xlabel('Loan original amount($)');
np.log10(loan_amount.describe())
From the graph shows that:
# Treat a debt-to-income ratio above 1 as an outlier.
outliers_dti = final_data['DebtToIncomeRatio'] > 1
# Rows flagged as outliers, kept separately for inspection.
out_df = final_data.loc[outliers_dti, :]
# `~` is the supported way to invert a boolean mask (unary `-` is not).
filter_df = final_data.loc[~outliers_dti, :]
filter_df.head()
filter_df.info()
# Renumber the rows 0..n-1 and discard the old index in a single step.
loan_df = filter_df.reset_index(drop=True)
loan_df.info()
find the correlation using heatmap between numerical variables
# Numeric columns compared against each other below.
numeric_var = [
    'Term', 'BorrowerAPR', 'BorrowerRate', 'LoanOriginalAmount',
    'CreditScoreRangeLower', 'CreditScoreRangeUpper', 'DebtToIncomeRatio',
    'Investors', 'StatedMonthlyIncome',
]

# Annotated heatmap of the pairwise correlation coefficients.
correlations = loan_df[numeric_var].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlations, annot=True, fmt='.3f')
plt.title('Correlation');

# Pair grid on a random sample of 10000 rows drawn without replacement:
# histograms on the diagonal, translucent scatter plots elsewhere.
sample_size = 10000
samples = np.random.choice(loan_df.shape[0], sample_size, replace=False)
loan_samp = loan_df.loc[samples, :]
pg = sns.PairGrid(data=loan_samp, vars=numeric_var, height=2)
pg = pg.map_diag(plt.hist, bins=20)
pg.map_offdiag(plt.scatter, alpha=1/6)
pg.fig.suptitle('Displaying how numerical features are related with each other', y=1.02);
From the graph shows that:
Removing outliers from credit score range upper and credit score range lower variables with values below 300
loan_df[['CreditScoreRangeLower', 'CreditScoreRangeUpper']].describe()
# Flag rows whose lower or upper credit score bound is below 300.
outlier_data = (loan_df['CreditScoreRangeLower'] < 300) | (loan_df['CreditScoreRangeUpper'] < 300)
# `~` is the supported way to invert a boolean mask (unary `-` is not).
loan_df = loan_df.loc[~outlier_data, :]
# drop=True: renumber the rows 0..n-1 without leaving the old index behind
# as a stray 'index' column in loan_df.
loan_df = loan_df.reset_index(drop=True)

# Correlation heatmap of the numeric columns after outlier removal.
plt.figure(figsize=(16, 8))
sns.heatmap(data=loan_df[numeric_var].corr(), annot=True, fmt='.3f')
plt.title('Correlation of numerical variables.');

# Pair grid on a fresh random sample of 10000 rows (the rows were just
# renumbered, so positional samples are valid labels for .loc).
samples = np.random.choice(loan_df.shape[0], 10000, replace=False)
loan_samp = loan_df.loc[samples, :]
pg = sns.PairGrid(data=loan_samp, vars=numeric_var, height=2)
pg = pg.map_diag(plt.hist, bins=20)
pg.map_offdiag(plt.scatter, alpha=1/6)
pg.fig.suptitle('Displaying how numerical features are related with each other', y=1.02);
After removing outliers:
Converting the credit score range from a numerical data type to a categorical data type.
# Credit score classes from lowest to highest.
credit_class = ['Poor', 'Fair', 'Good', 'Very Good', 'Exceptional']
# Left-closed bin edges matching the original ranges:
# [300, 580) Poor, [580, 670) Fair, [670, 740) Good,
# [740, 800) Very Good, [800, 881) Exceptional.
credit_bin_edges = [300, 580, 670, 740, 800, 881]
# Vectorised binning replaces the per-row Python loop. A score outside
# [300, 881) becomes NaN instead of causing a length mismatch on assignment.
loan_df = loan_df.assign(
    credit_score_class=pd.cut(loan_df['CreditScoreRangeLower'],
                              bins=credit_bin_edges,
                              labels=credit_class,
                              right=False)
)
# Convert the data type to an ordered categorical.
t = pd.api.types.CategoricalDtype(categories=credit_class, ordered=True)
loan_df['credit_score_class'] = loan_df['credit_score_class'].astype(t)
Borrower APR and credit score class variable.
# Distribution of borrower APR within each credit score class.
# inner=None draws the violins without interior marks (the keyword was
# previously misspelled as `innner`).
sns.violinplot(data=loan_df, x='credit_score_class', y='BorrowerAPR',
               color=sns.color_palette()[0], inner=None)
plt.xticks(rotation=15)
plt.title('Relation between credit score class and APR');
Borrower APR and loan status variable.
# Box plot: spread of borrower APR within each loan status.
box_color = sns.color_palette()[8]
plt.figure(figsize=(12, 10))
sns.boxplot(data=loan_df, x='LoanStatus', y='BorrowerAPR', color=box_color)
plt.xticks(rotation=25)
plt.title('Relation between APR and loan status');

# One APR histogram per loan status; each facet keeps its own axis ranges.
bins = np.arange(0.01, 0.4 + 0.01, 0.01)
f = sns.FacetGrid(data=loan_df, col='LoanStatus', col_wrap=4,
                  height=4.5, sharex=False, sharey=False)
f.map(plt.hist, 'BorrowerAPR', bins=bins)
f.fig.suptitle('Borrower APR vs Loan status', y=1.02);

# Point plot of APR per loan status with standard-deviation error bars and
# no line joining the points.
point_color = sns.color_palette()[9]
plt.figure(figsize=(14, 7))
sns.pointplot(data=loan_df, x='LoanStatus', y='BorrowerAPR',
              color=point_color, join=False, ci='sd')
plt.xticks(rotation=25)
plt.title('Relation between loan status and APR')

# Violin plot of APR per loan status with quartile lines inside each violin.
violin_color = sns.color_palette()[8]
plt.figure(figsize=(14, 7))
sns.violinplot(data=loan_df, x='LoanStatus', y='BorrowerAPR',
               color=violin_color, inner='quartile')
plt.xticks(rotation=25)
plt.title('Relation between loan status and APR')
Borrower APR and Borrower Rate variable.
# Bin APR and rate into 0.04-wide intervals; each interval is labelled with
# its (rounded) upper edge and stored in a '<column> cat' column.
for lab in ('BorrowerAPR', 'BorrowerRate'):
    raw_edges = np.arange(0, loan_df[lab].max() + 0.04, 0.04)
    bins = [round(edge, 2) for edge in raw_edges]
    loan_df[f'{lab} cat'] = pd.cut(x=loan_df[lab], bins=bins,
                                   labels=bins[1:], include_lowest=False)

# Count loans per (APR bin, rate bin) pair; empty cells become NaN so they
# carry no annotation.
apr_rate_counts = (
    loan_df.groupby(['BorrowerAPR cat', 'BorrowerRate cat'])
    .size()
    .unstack('BorrowerRate cat')
    .replace({0: np.nan})
)
plt.figure(figsize=(18, 7))
sns.heatmap(apr_rate_counts,
            annot=True, fmt='.0f', cmap='viridis_r', vmin=0,
            cbar_kws={'label': 'Count of borrower Rate with respect to borrower APR'})
plt.title('Borrower Rate affect on Borrower APR')
plt.xlabel('Borrower Rate')
plt.ylabel('Borrower APR')
plt.yticks(rotation=0)
plt.show()
From above heatmap:
Borrower APR and Loan Term quantitative variables.
# Jittered, highly transparent scatter of APR against loan term with a
# fitted regression line.
plt.figure(figsize=(12, 10))
sns.regplot(data=loan_df, x='Term', y='BorrowerAPR',
            x_jitter=2, truncate=False, scatter_kws={'alpha': 1/30})
plt.title('Distribution of loan term w.r.t APR');

# Side by side: box plot (left) and violin plot with quartiles (right) of
# APR per loan term.
palette = sns.color_palette()
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.boxplot(data=loan_df, x='Term', y='BorrowerAPR', color=palette[2])
plt.title('Term vs Borrower APR')
plt.subplot(1, 2, 2)
sns.violinplot(data=loan_df, x='Term', y='BorrowerAPR',
               inner='quartile', color=palette[4])
plt.title('Term vs Borrower APR');
I have made several observations from above three plots:
Borrower APR and Loan original amount quantitative variables.
# Translucent scatter plot of borrower APR against the original loan amount.
plt.figure(figsize=(14, 7))
sns.scatterplot(data=loan_df, x='LoanOriginalAmount', y='BorrowerAPR',
                alpha=1/4, x_jitter=0.3)
plt.title('Loan original amount vs APR')
We can see that there is a gradual decrease in borrower APR as the loan amount increases, but this still doesn't give a clear picture of how many APR values fall within a given range of loan amounts.
Add annotations and transform x-axis and y-axis bins to get more accurate data representation
# 2-D histogram of loan amount (x) against borrower APR (y), with the count
# of every non-empty cell written inside it.
plt.figure(figsize=(18, 7))
bins_x2 = np.arange(1000, 35000 + 5000, 3000)
bins_y2 = np.arange(0.05, 0.40 + 0.06, 0.04)
labels = ['{}'.format(x) for x in bins_x2]
# cmin=0.5 turns empty cells into NaN so they are left uncoloured.
gt = plt.hist2d(y='BorrowerAPR', x='LoanOriginalAmount', data=loan_df,
                cmin=0.5, cmap='viridis_r', bins=[bins_x2, bins_y2])
count = gt[0]
x_edges, y_edges = gt[1], gt[2]
for (i, j), annot in np.ndenumerate(count):
    # `annot != np.nan` is always True, so test for NaN explicitly.
    if np.isnan(annot) or annot <= 0:
        continue
    # White text on the dense cells, black text on the sparse ones.
    text_color = 'white' if annot >= 2000 else 'black'
    plt.text(x=x_edges[i] + 50, y=y_edges[j] + 0.005, s=int(annot), color=text_color)
plt.colorbar()
plt.xticks(bins_x2, labels)
plt.title('Influence of loan original amount on APR')
plt.xlabel('LOA')
plt.ylabel('APR');
Using the above heatmap, following observations have been made:
- Count of borrowers APR between 1000-7000 approx. is highest.
- Highest count of APR between 1000-7000 is 5428 which is situated at 0.35 approx.
- As loan original amount is increasing, borrower APR is decreasing and count of borrower APR is decreasing too. (yellow boxes after 16000 loan amount)
It is now getting more clear that the borrowers who took small amount of loans have highest count as well as have highest APR values. But I want to see why borrowers with small loan amount have high APR and also what are the reasons behind it?. Is it because of type of loan or because of high debt to income ratio or due to occupation etc.
Plotting violin plot using seaborn's pairgrid with APR values >=0.20.
# Keep only the loans whose APR is at least 0.20.
high_apr_mask = loan_df['BorrowerAPR'] >= 0.2
bad_apr = loan_df[high_apr_mask]
# Two stacked violin plots sharing the employment status axis: loan amount
# on the first row and APR on the second.
violin_color = sns.color_palette()[1]
g = sns.PairGrid(data=bad_apr, x_vars='EmploymentStatus',
                 y_vars=['LoanOriginalAmount', 'BorrowerAPR'],
                 height=4.9, aspect=3, diag_sharey=False)
g.map(sns.violinplot, inner='quartile', color=violin_color)
g.fig.suptitle('Employment status vs loan original amount (APR >=0.20)')
# plt.title applies to the current axes only.
plt.title('Employment status vs Borrower APR (APR >=0.20)')
plt.show()
- For every employment status most of the loan original amount is between 1000 and 5000.
- Not employed status has the least count but their APR ranges from 0.15 to 0.42 approx. Also, most of the APR values lie between median (second quartile 0.28 approx.) and third quartile (0.32 approx.).
- Not available status has 1647 counts whose APR is >= 0.20. Most of the APR values are present between 0.28 and 0.32.
Nearly for every employment status, most of the loan amount lies in same range and also, borrowers who are employed seems to have APR normally distributed compare to other borrowers APR.
So, in my view, employment status is not a good variable for explaining why borrower APR is high for some loan amounts and low for others, or what affects borrower APR. I will explore other categorical variables and compare them with borrower APR and borrower rate further in my exploratory analysis.
Occupation ,Borrower APR qualitative and quantitative variables respectively.
# Mean borrower APR per occupation, as a two-column frame.
mean_apr_by_occupation = loan_df.groupby('Occupation')['BorrowerAPR'].mean()
occ_apr = mean_apr_by_occupation.reset_index()
occ_apr
# Line through the mean APR of every occupation (occupations on the y-axis).
plt.figure(figsize=(10, 20))
plt.errorbar(data=occ_apr, x='BorrowerAPR', y='Occupation')
plt.title('Occupation vs borrower APR');
We can't make a clear interpretation about which occupations have the lowest average APR and which have the highest.
# Split the occupations into bands of average APR. Explicit strict
# comparisons replace `between(..., inclusive=False)`: pandas 2.x no longer
# accepts a boolean for `inclusive`, while this form works on every version
# and keeps both bounds exclusive.
apr_means = occ_apr['BorrowerAPR']
filter1 = occ_apr[apr_means < 0.2]
f2 = (apr_means > 0.2) & (apr_means < 0.21)
filter2 = occ_apr.loc[f2, :]
f3 = (apr_means > 0.21) & (apr_means < 0.22)
filter3 = occ_apr.loc[f3, :]
f4 = (apr_means > 0.22) & (apr_means < 0.23)
filter4 = occ_apr.loc[f4, :]
f5 = (apr_means > 0.23) & (apr_means < 0.25)
filter5 = occ_apr.loc[f5, :]

# One line plot per APR band:
# (band frame, line colour, line style, x-tick rotation, title).
palette = sns.color_palette()
apr_band_plots = [
    (filter1, palette[4], '--', 20,
     'Occupation vs Average borrower APR (<0.2)'),
    (filter2, palette[4], '-.', 20,
     'Occupation vs Average borrower APR (between 0.2 and 0.21)'),
    (filter3, palette[4], ':', 20,
     'Occupation vs Average borrower APR (between 0.21 and 0.22)'),
    (filter4, palette[4], '--', 25,
     'Occupation vs Average borrower APR (between 0.22 and 0.23)'),
    (filter5, palette[5], 'dotted', 20,
     'Occupation vs Average borrower APR (between 0.23 and 0.25)'),
]
for band, line_color, line_style, tick_rotation, band_title in apr_band_plots:
    plt.figure(figsize=[18, 7])
    # Markers for each occupation, joined by a styled line.
    plt.scatter(x='Occupation', y='BorrowerAPR', data=band)
    plt.errorbar(x='Occupation', y='BorrowerAPR', data=band,
                 c=line_color, linestyle=line_style)
    plt.xticks(rotation=tick_rotation)
    plt.title(band_title)
    plt.xlabel('Occupation')
    plt.ylabel('Average borrower APR');
Explore credit class and loan status.
# One count plot of credit score class per loan status, three facets per
# row, each facet with independent axes.
class_order = ['Poor', 'Fair', 'Good', 'Very Good', 'Exceptional']
ls = sns.FacetGrid(data=loan_df, col='LoanStatus', col_wrap=3,
                   height=5, sharex=False, sharey=False)
ls.map(sns.countplot, 'credit_score_class', order=class_order)
ls.fig.suptitle('Loan status vs credit score class', y=1.02);
From above facet plot I have made some observations:
Borrower APR and Debt To Income Ratio.
# 0.01-wide debt-to-income bins. (The binning used to be computed twice in
# a row; once is enough.)
bins_edges = np.arange(0, 1 + 0.1, 0.01)
# Midpoint of each bin. The former `+ 0.0025` offset placed the points a
# quarter of the way into each 0.01-wide bin instead of at its centre.
bins_center = (bins_edges[:-1] + bins_edges[1:]) / 2
# Cut the bin values into discrete intervals. Returns a Series object.
displ_binned = pd.cut(loan_df['DebtToIncomeRatio'], bins_edges, include_lowest=True)
displ_binned

plt.figure(figsize=[14, 7])
# observed=False keeps empty bins in the result so that the summaries stay
# aligned with `bins_center` regardless of the pandas default.
apr_by_dti_bin = loan_df['BorrowerAPR'].groupby(displ_binned, observed=False)
comb_mean = apr_by_dti_bin.mean()
comb_std = apr_by_dti_bin.std()
# Plot the summarized data: raw points, per-bin mean (markers and solid
# line) and per-bin standard deviation (dashed line).
plt.scatter(data=loan_df, x='DebtToIncomeRatio',
            y='BorrowerAPR', alpha=1/20)
plt.scatter(x=bins_center, y=comb_mean)
plt.errorbar(x=bins_center, y=comb_mean, c=sns.color_palette('dark')[3])
plt.errorbar(x=bins_center, y=comb_std, c=sns.color_palette('dark')[3], linestyle='--')
plt.title('Debt to income ratio vs Average borrower APR (Standard deviation,Mean)')
plt.xlabel('Debt to income ratio (changed bins size)')
plt.ylabel('Average borrower APR');
There is a weak positive correlation, which indicates that while DTI and APR tend to go up together, the relationship isn't very strong.
Explore Borrower APR and Investors variables.
# 100-wide investor-count bins and their midpoints.
# NOTE(review): 1189 is presumably the largest investor count in the data;
# confirm it still holds after the outlier filtering.
bins_edges = np.arange(1, 1189 + 100, 100)
bins_center = bins_edges[:-1] + 50
# Cut the bin values into discrete intervals. Returns a Series object.
displ_binned = pd.cut(loan_df['Investors'], bins_edges, include_lowest=True)
displ_binned
plt.figure(figsize=[14, 7])
# observed=False keeps empty bins so the summaries stay aligned with
# `bins_center` regardless of the pandas default.
apr_by_investor_bin = loan_df['BorrowerAPR'].groupby(displ_binned, observed=False)
comb_mean = apr_by_investor_bin.mean()
# Spread of the same variable as the mean; this was previously computed on
# DebtToIncomeRatio, which is unrelated to this plot.
comb_std = apr_by_investor_bin.std()
# Plot the summarized data
plt.errorbar(x=bins_center, y=comb_mean)
plt.xticks(rotation=15);
plt.xlabel('investors')
plt.ylabel('Average APR')
plt.title('Investors vs Average APR');
There is a moderate negative correlation, which indicates that APR tends to go down as the number of investors goes up, and that the relationship is moderately strong.
Borrower APR was high for small loan original amounts and low for large loan amounts. Also, borrower APR and debt-to-income ratio tended to go up together but were only weakly correlated, whereas borrower APR was weakly negatively correlated with monthly income.
Explore Loan Status with Borrower APR and Borrower Rate
# One 2-D histogram of APR (x) against borrower rate (y) per loan status.
# APR uses 0.02-wide bins and rate uses 0.03-wide bins, both starting at 0.
apr_max = loan_df['BorrowerAPR'].max()
rate_max = loan_df['BorrowerRate'].max()
bins_x = np.arange(0, apr_max + 0.01, 0.02)
bins_y = np.arange(0, rate_max + 0.01, 0.03)
g = sns.FacetGrid(data=loan_df, col='LoanStatus', col_wrap=4, height=4.7)
# cmin=0.5 leaves empty cells uncoloured.
g.map(plt.hist2d, 'BorrowerAPR', 'BorrowerRate',
      bins=[bins_x, bins_y], cmin=0.5, cmap='viridis_r')
g.set_xlabels('APR')
g.set_ylabels('Rate')
g.fig.suptitle('Loan status by borrower rate and APR', y=1.01)
plt.show()
I have made several observations:
Explore loan status , Loan Term , loan amount and debt to income ratio.
I want to see what features affect loan status.
def _facet_loa_dti(frame, title, **grid_kwargs):
    """Draw loan amount vs DTI 2-D histograms, one facet column per loan status.

    `grid_kwargs` are forwarded to `sns.FacetGrid`; every facet has its own
    axis ranges. Returns the populated grid.
    """
    grid = sns.FacetGrid(data=frame, col='LoanStatus',
                         sharex=False, sharey=False, **grid_kwargs)
    # cmin=0.5 leaves empty cells uncoloured.
    grid.map(plt.hist2d, 'LoanOriginalAmount', 'DebtToIncomeRatio',
             cmap='inferno_r', cmin=0.5)
    grid.set_xlabels('loa')
    grid.set_ylabels('dti')
    grid.fig.suptitle(title, y=1.01)
    return grid


# Plotting heatmap using seaborn's facetgrid: all terms together.
g = _facet_loa_dti(loan_df, 'Loan status by DTI and loan amount',
                   col_wrap=4, height=4)
plt.colorbar()
plt.show()

# Plotting heatmap for 'Term' = 12, one facet row per credit score class.
term12 = loan_df[loan_df['Term'] == 12]
g = _facet_loa_dti(term12, 'Loan status by loan amount, credit score class and DTI',
                   row='credit_score_class', height=5, aspect=0.85)
plt.show()

# Plotting heatmap for 'Term' = 36.
term36 = loan_df[loan_df['Term'] == 36]
g = _facet_loa_dti(term36, 'Loan status by loan amount,credit score class and DTI',
                   row='credit_score_class', height=5, aspect=0.85)
plt.show()

# Plotting heatmap for 'Term' = 60.
term60 = loan_df[loan_df['Term'] == 60]
g = _facet_loa_dti(term60, 'Loan status by loan amount,credit score class and DTI',
                   row='credit_score_class', height=5, aspect=0.85)
plt.show()
I have made several observations:
Explore credit class, monthly income, and loan status
Grouping data by credit score class,loan status and calculating mean of monthly income for each grouping variable.
# Mean stated monthly income for every (credit score class, loan status) pair.
income_groups = loan_df.groupby(['credit_score_class', 'LoanStatus'])
mean_income = income_groups['StatedMonthlyIncome'].mean()
mi = mean_income.reset_index()

# Point plot on a log-scaled y-axis: one dodged, unconnected point series
# per loan status.
fig = plt.figure(figsize=[18, 6])
ax = sns.pointplot(data=mi, x='credit_score_class', y='StatedMonthlyIncome',
                   hue='LoanStatus', dodge=0.4, linestyles='', palette='Blues')
plt.yscale('log')
# Label the log axis with plain income values and hide the minor tick labels.
income_ticks = [2e3, 3e3, 4e3, 5e3, 6e3, 8e3, 10e3]
income_tick_labels = ['2000', '3000', '4000', '5000', '6000', '8000', '10000']
plt.yticks(income_ticks, income_tick_labels)
ax.set_yticklabels([], minor=True)
plt.legend(loc=2)
plt.title('Stated monthly income by credit score class and loan status (log-scale)')
plt.show();
With loan status and change in credit score from 'Poor' to 'Exceptional' we can observe increase in stated monthly income.
# One 2-D histogram of APR (x) against borrower rate (y) per credit score
# class, three facets per row, each facet with its own axis ranges.
apr_max = loan_df['BorrowerAPR'].max()
rate_max = loan_df['BorrowerRate'].max()
bins_x = np.arange(0, apr_max + 0.01, 0.02)
bins_y = np.arange(0, rate_max + 0.01, 0.03)
g = sns.FacetGrid(data=loan_df, col='credit_score_class', col_wrap=3,
                  height=3, sharex=False, sharey=False)
# cmin=0.5 leaves empty cells uncoloured.
g.map(plt.hist2d, 'BorrowerAPR', 'BorrowerRate',
      bins=[bins_x, bins_y], cmin=0.5, cmap='viridis_r')
g.set_xlabels('APR')
g.set_ylabels('Rate')
g.fig.suptitle('Credit score class by APR and borrower Rate', y=1.02)
plt.show()
Maximum borrowers with credit score 'Good', 'Very Good' and 'Exceptional' had borrower APR and borrower Rate below 0.20.
# Plotting heatmap using seaborn's facetgrid: APR (x) against borrower rate
# (y), one facet per employment status.
bins_x = np.arange(0, loan_df['BorrowerAPR'].max() + 0.01, 0.02)
bins_y = np.arange(0, loan_df['BorrowerRate'].max() + 0.01, 0.03)
g = sns.FacetGrid(data=loan_df, col='EmploymentStatus',
                  sharex=False, sharey=False, col_wrap=3, height=3)
g.map(plt.hist2d, 'BorrowerAPR', 'BorrowerRate', bins=[bins_x, bins_y], cmap='viridis_r', cmin=0.5)
g.set_xlabels('APR')
g.set_ylabels('Rate')
g.fig.suptitle('Employment status by borrower Rate and APR', y=1.02)
plt.show()

# Plotting heatmap using seaborn's facetgrid: loan amount (x) against debt
# to income ratio (y), one facet per employment status.
# Cube-root-spaced edges: `** 1/3` parsed as `(x ** 1) / 3`, so the exponent
# is now parenthesised.
# NOTE(review): bins, tick, labels, bins_y and tick_y are not passed to the
# hist2d call below, which therefore uses matplotlib's default binning.
bins = (np.arange(0, 1 + 0.01, 0.02)) ** (1/3)
tick = [0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1]
labels = ['{}'.format(v) for v in tick]
bins_y = np.log10(np.arange(1000, loan_df['LoanOriginalAmount'].max() + 40, 1500))
tick_y = []
g = sns.FacetGrid(data=loan_df, col='EmploymentStatus', sharex=False,
                  sharey=False, col_wrap=3, height=3)
g.map(plt.hist2d, 'LoanOriginalAmount', 'DebtToIncomeRatio', cmap='viridis_r', cmin=0.5)
g.set_xlabels('loa')
g.set_ylabels('dti')
g.fig.suptitle('Employment status by debt to income ratio and loan original amount', y=1.02)
plt.show()
Many borrowers having 'self employed' as employment status, have had loan original amount close to 25000.
Borrowers with 'not employed' as their employment status are very few in our data, and most of them have a loan amount close to only 2500, with an APR close to 0.20.